1 Machine Learning Fundamentals

Last compiled: 2024-06-26

# install.packages("plotly")

library(tidyverse)
library(tidyquant)
library(broom)
library(umap)
library(ggrepel)
# STOCK PRICES
sp_500_prices_tbl <- read_rds("sp_500_prices_tbl.rds")
sp_500_prices_tbl
# SECTOR INFORMATION
sp_500_index_tbl <- read_rds("raw_data/sp_500_index_tbl.rds")
sp_500_index_tbl
# Apply your data transformation skills!
sp_500_daily_returns_tbl<- sp_500_prices_tbl%>%
    select(symbol,date,adjusted)%>%
    filter(lubridate::year(date)>=2018)%>%
    group_by(symbol)%>%
    mutate(lag=lag(adjusted,1))%>%
    drop_na(lag)%>%
    mutate(diff=adjusted-lag)%>%
    mutate(pct_return=diff/lag)%>%
    select(symbol,date,pct_return)
# Output: sp_500_daily_returns_tbl
sp_500_daily_returns_tbl
sp_500_daily_returns_tbl <- read_rds("raw_data/sp_500_daily_returns_tbl.rds")
sp_500_daily_returns_tbl
# Convert to User-Item Format
stock_date_matrix_tbl<-sp_500_daily_returns_tbl%>%
    pivot_wider(names_from = date , values_from=pct_return, values_fill=0)
# Output: stock_date_matrix_tbl
stock_date_matrix_tbl
stock_date_matrix_tbl <- read_rds("raw_data/stock_date_matrix_tbl.rds")
stock_date_matrix_tbl
# Create kmeans_obj for 4 centers
kmeans_obj<-stock_date_matrix_tbl%>%
    select(2:10)%>%
    kmeans(centers = 4,nstart = 20)
# Apply glance() to get the tot.withinss
broom::glance(kmeans_obj)
kmeans_mapper <- function(center = 3) {
    stock_date_matrix_tbl %>%
        select(-symbol) %>%
        kmeans(centers = center, nstart = 20)
}
# Use purrr to map
k_means_mapped_tbl<-tibble(centers=1:30)%>%
    mutate(k_means = map(centers, kmeans_mapper))%>%
    mutate(glance  = map(k_means,glance))
# Output: k_means_mapped_tbl 
k_means_mapped_tbl
# Visualize Scree Plot

k_means_mapped_tbl %>%
    unnest(glance) %>%
    select(centers, tot.withinss)%>%
    ggplot(aes(centers, tot.withinss))+
    geom_point(color = "#2DC6D6", size = 4) +
    geom_line(color = "#2DC6D6", size = 1)+
    ggrepel::geom_label_repel(aes(label = centers), color = "#2DC6D6")+ 
    theme_light()+
    labs(title = "Scree Plot")

k_means_mapped_tbl <- read_rds("raw_data/k_means_mapped_tbl.rds")
# Apply UMAP
umap_results<-stock_date_matrix_tbl%>%
    select(-symbol)%>%
    umap()
    
# Store results as: umap_results 
# Convert umap results to tibble with symbols
umap_results_tbl <- umap_results$layout %>%
    as_tibble()%>%
    bind_cols(
        stock_date_matrix_tbl %>% select(symbol)
    )
    

# Output: umap_results_tbl
umap_results_tbl
# Visualize UMAP results
umap_results_tbl %>%
    ggplot(aes(x=V1, y=V2)) +
    geom_point(alpha=0.5) + 
    theme_tq()+
    geom_label_repel(aes(label = symbol), size = 3)+
    labs(title = "UMAP Projection")

k_means_mapped_tbl <- read_rds("raw_data/k_means_mapped_tbl.rds")
umap_results_tbl   <- read_rds("raw_data/umap_results_tbl.rds")
# Get the k_means_obj from the 10th center
k_means_obj <- k_means_mapped_tbl %>%
    pull(k_means) %>%
    pluck(10)
# Store as k_means_obj
# Use your dplyr & broom skills to combine the k_means_obj with the umap_results_tbl
umap_kmeans_results_tbl<-k_means_obj%>%
    augment(stock_date_matrix_tbl) %>%
    select(symbol, .cluster) %>%
    left_join(umap_results_tbl, by= "symbol")%>%
    left_join(sp_500_index_tbl %>% select(symbol, company, sector), by="symbol")
# Output: umap_kmeans_results_tbl
umap_kmeans_results_tbl
# Visualize the combined K-Means and UMAP results
umap_kmeans_results_tbl%>%
        ggplot(aes(V1, V2, color = .cluster)) +
        geom_point(alpha=0.5) +
    scale_color_manual(values=c("#2d72d6", "#2dc6d6", "#2dd692", "#d62dc5", "#d62d30","#ced62d", "#60d62d", "#571341", "#d62dc5", "#6e5552"))